packages

pacman::p_load(dplyr, ggplot2, googlesheets, openxlsx, stringr, rvest, dplyr, ggplot2, h2o, caret, text2vec)
Installing package into ‘C:/Users/Workplace/Documents/R/win-library/3.4’
(as ‘lib’ is unspecified)
also installing the dependencies ‘lambda.r’, ‘futile.options’, ‘RcppParallel’, ‘data.table’, ‘irlba’, ‘futile.logger’, ‘mlapi’, ‘sparsepp’

trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.4/lambda.r_1.2.zip'
Content type 'application/zip' length 93201 bytes (91 KB)
downloaded 91 KB

trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.4/futile.options_1.0.0.zip'
Content type 'application/zip' length 17090 bytes (16 KB)
downloaded 16 KB

trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.4/RcppParallel_4.3.20.zip'
Content type 'application/zip' length 3378535 bytes (3.2 MB)
downloaded 3.2 MB

trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.4/data.table_1.10.4-3.zip'
Content type 'application/zip' length 1577200 bytes (1.5 MB)
downloaded 1.5 MB

trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.4/irlba_2.3.2.zip'
Content type 'application/zip' length 279956 bytes (273 KB)
downloaded 273 KB

trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.4/futile.logger_1.4.3.zip'
Content type 'application/zip' length 95441 bytes (93 KB)
downloaded 93 KB

trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.4/mlapi_0.1.0.zip'
Content type 'application/zip' length 72925 bytes (71 KB)
downloaded 71 KB

trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.4/sparsepp_0.2.0.zip'
Content type 'application/zip' length 144909 bytes (141 KB)
downloaded 141 KB

trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.4/text2vec_0.5.1.zip'
Content type 'application/zip' length 5140413 bytes (4.9 MB)
downloaded 4.9 MB
package ‘lambda.r’ successfully unpacked and MD5 sums checked
package ‘futile.options’ successfully unpacked and MD5 sums checked
package ‘RcppParallel’ successfully unpacked and MD5 sums checked
package ‘data.table’ successfully unpacked and MD5 sums checked
package ‘irlba’ successfully unpacked and MD5 sums checked
package ‘futile.logger’ successfully unpacked and MD5 sums checked
package ‘mlapi’ successfully unpacked and MD5 sums checked
package ‘sparsepp’ successfully unpacked and MD5 sums checked
package ‘text2vec’ successfully unpacked and MD5 sums checked

The downloaded binary packages are in
    C:\Users\Workplace\AppData\Local\Temp\RtmpGsflLY\downloaded_packages

text2vec installed
package ‘text2vec’ was built under R version 3.4.3

load data

# Restore the saved sample and bind it to `df`.
# load() restores the saved object(s) into the current environment and returns
# their name(s); get() then looks the object up by that name, so `df` is bound
# whatever name the object was saved under.
# NOTE(review): assumes the file holds exactly one object -- confirm.
df <- get(load("sub_sample2.Rdata"))

clean

# Normalise raw social-media text into a lower-cased string in which URLs,
# handles, hashtags, emoticons, parentheses and punctuation are replaced by
# upper-case placeholder tokens.
#
# x:       character vector of raw posts.
# returns: character vector of the same length as `x`, trimmed at both ends.
#
# The order of the steps matters, because later steps rewrite characters that
# earlier patterns need:
#   * emoticons are replaced BEFORE the parenthesis rules, otherwise every
#     "(" / ")" is already gone and no parenthesis emoticon can match;
#   * the text is already lower-cased when emoticons are matched, so the
#     patterns containing "D" / "X" are matched case-insensitively;
#   * ":((" (cry) is tested before ":(" (frown), and "..." before ".".
#
# NOTE(review): the output of this function has to match the preprocessing
# that was used when the vectorizer / model were fitted. If they were fitted
# on text produced by a different step order, re-fit them.
clean_social_media <- function(x){
  x %>%
    str_replace_all("\n", " ") %>%
    str_to_lower() %>%
    ### Twitter specific
    str_replace_all("https?[:]//[[:graph:]]+", "URL") %>%
    str_replace_all("@(\\w+)", " HNDL") %>%
    str_replace_all("#(\\w+)", " HASH") %>%
    ### smilies (must run before the parenthesis replacements below)
    str_replace_all(":-\\)|:\\)|\\(:|\\(-:", " EMO_SMILEY ") %>%
    str_replace_all(
      regex(":-D|:D|X-D|XD|xD", ignore_case = TRUE),
      " EMO_LAUGH "
    ) %>%
    str_replace_all("<3|:\\*", " EMO_LOVE ") %>%
    str_replace_all(
      regex(";-\\)|;\\)|;-D|;D|\\(;|\\(-;", ignore_case = TRUE),
      " EMO_WINK "
    ) %>%
    # cry before frown: ":((" would otherwise be consumed as ":(" + "("
    str_replace_all(':,\\(|:"\\(|:\\(\\(', " EMO_CRY ") %>%
    str_replace_all(":-\\(|:\\(|\\):|\\)-:", " EMO_FROWN ") %>%
    ### ALtright replacements
    # NOTE(review): ".?" also consumes the character that follows the opening
    # parentheses -- confirm this is intended.
    str_replace_all("\\(+.?", "JEW ") %>%
    str_replace_all("\\)+", " ") %>%
    ### General
    # ellipsis first, otherwise the single-dot rule leaves nothing to match
    str_replace_all("\\.\\.\\.", " PUNC_DOTS ") %>%
    str_replace_all("\\.|\\:|\\;", " PUNC_DOT ") %>%
    str_replace_all("\\!", " PUNC_EXCL ") %>%
    str_replace_all("\\?", " PUNC_QUES ") %>%
    str_trim()
}
df <- df %>%
  mutate(ctext = clean_social_media(text))
package ‘bindrcpp’ was built under R version 3.4.3

vectorize

# Restore the saved vectorizer object(s) into the current environment.
# NOTE(review): the file is expected to provide an object named `vectorizer`,
# which is used below -- confirm.
load("vectorizer.Rdata")
### test
# Token iterator over the cleaned text, keyed by document id.
pred_it <- itoken(
  df$ctext,
  ids = df$id,
  # spelled out: `F` is an ordinary variable that can be reassigned
  progressbar = FALSE
)
# Document-term matrix for the unseen data, built with the loaded vectorizer.
pred_dtm <- create_dtm(pred_it, vectorizer)

predict

# Break the prediction table into a list of data frames, one per value of `sp`.
split(df_pred, df_pred[, "sp"])
$`1`

$`2`

$`3`

$`4`

$`5`

$`6`

$`7`

$`8`

$`9`

$`10`

$`11`

$`12`

$`13`

$`14`

$`15`

$`16`

$`17`

$`18`

$`19`

$`20`
NA
LS0tDQp0aXRsZTogIlByZWRpY3Qgb24gdW5zZWVuIGRhdGEiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQoNCiMjIHBhY2thZ2VzDQoNCmBgYHtyfQ0KcGFjbWFuOjpwX2xvYWQoZHBseXIsIGdncGxvdDIsIGdvb2dsZXNoZWV0cywgb3Blbnhsc3gsIHN0cmluZ3IsIHJ2ZXN0LCBkcGx5ciwgZ2dwbG90MiwgaDJvLCBjYXJldCwgdGV4dDJ2ZWMpDQpgYGANCg0KDQojIyBsb2FkIGRhdGENCg0KYGBge3J9DQpkZiA8LSBnZXQobG9hZCgic3ViX3NhbXBsZTIuUmRhdGEiKSkNCmBgYA0KDQoNCiMjIGNsZWFuIA0KDQpgYGB7cn0NCmNsZWFuX3NvY2lhbF9tZWRpYSA8LSBmdW5jdGlvbih4KXsNCg0KICB4ICU+JQ0KICAgIHN0cl9yZXBsYWNlX2FsbCgiXG4iLCAiICIpICU+JQ0KICAgIHN0cl90b19sb3dlcigpICU+JQ0KICAgICMjIyBUd2l0dGVyIHNwZWNpZmljDQogICAgc3RyX3JlcGxhY2VfYWxsKCJodHRwcz9bOl0vL1tbOmdyYXBoOl1dKyIsICJVUkwiKSAlPiUNCiAgICBzdHJfcmVwbGFjZV9hbGwoIkAoXFx3KykiLCAiIEhOREwiKSAlPiUNCiAgICBzdHJfcmVwbGFjZV9hbGwoIiMoXFx3KykiLCAiIEhBU0giKSAlPiUNCiAgICAjIyMgQUx0cmlnaHQgcmVwbGFjZW1lbnRzDQogICAgc3RyX3JlcGxhY2VfYWxsKCJcXCgrLj8iLCAiSkVXICIpICU+JQ0KICAgIHN0cl9yZXBsYWNlX2FsbCgiXFwpKyIsICIgIikgJT4lDQogICAgIyMjIHNtaWxpZXMNCiAgICBzdHJfcmVwbGFjZV9hbGwoIjotXFwpfDpcXCl8XFwoOnxcXCgtOiIsICIgRU1PX1NNSUxFWSAiKSAlPiUNCiAgICBzdHJfcmVwbGFjZV9hbGwoIjotRHw6RHxYLUR8WER8eEQiLCAiIEVNT19MQVVHSCAiKSAlPiUNCiAgICBzdHJfcmVwbGFjZV9hbGwoIjwzfDpcXCoiLCAiRU1PX0xPVkUiKSAlPiUNCiAgICBzdHJfcmVwbGFjZV9hbGwoIjstXFwpfDtcXCl8Oy1EfDtEfFxcKDt8XFwoLTsiLCAiRU1PX1dJTksiKSAlPiUNCiAgICBzdHJfcmVwbGFjZV9hbGwoIjotXFwofDpcXCh8XFwpOnxcXCktOiIsICJFTU9fRlJPV04iKSAlPiUNCiAgICBzdHJfcmVwbGFjZV9hbGwoJzosXFwofDoiXFwofDpcXChcXCgnLCAiRU1PX0NSWSIpICU+JQ0KICAgICMjIyBHZW5lcmFsDQogICAgc3RyX3JlcGxhY2VfYWxsKCJcXC58XFw6fFxcOyIsICIgUFVOQ19ET1QgIikgJT4lDQogICAgc3RyX3JlcGxhY2VfYWxsKCJcXCEiLCAiIFBVTkNfRVhDTCAiKSAlPiUNCiAgICBzdHJfcmVwbGFjZV9hbGwoIlxcPyIsICIgUFVOQ19RVUVTICIpICU+JQ0KICAgIHN0cl9yZXBsYWNlX2FsbCgiXFwuXFwuXFwuIiwgIiBQVU5DX0RPVFMgIikgJT4lDQogICAgIyMjIFdoaXRlIFNwYWNlDQogICAgc3RyX3JlcGxhY2VfYWxsKCJcXHMrIiwgIiAiKSAlPiUNCiAgICBzdHJfdHJpbSgpIA0KfQ0KDQoNCmRmIDwtIGRmICU+JQ0KICBtdXRhdGUoY3RleHQgPSBjbGVhbl9zb2NpYWxfbWVkaWEodGV4dCkpDQpgYGANCg0KDQoNCiMjIHZlY3Rvcml6ZQ0KDQpgYGB7cn0NCmxvYWQoInZlY3Rvcml6ZXIuUmRhdGEiKQ0K
IyMjIHRlc3QNCnByZWRfaXQgPC0gaXRva2VuKA0KICBkZiRjdGV4dCwgDQogIGlkcyA9IGRmJGlkLA0KICBwcm9ncmVzc2JhciA9IEYNCikNCg0KcHJlZF9kdG0gPC0gY3JlYXRlX2R0bShwcmVkX2l0LCB2ZWN0b3JpemVyKQ0KYGBgDQoNCg0KIyMgcHJlZGljdA0KDQpgYGB7cn0NCmxpYnJhcnkoaDJvKQ0KIyMjIGluaXRpYWxpemUgYW4gaDJvIGluc3RhbmNlDQpoMm8uaW5pdChudGhyZWFkcyA9IC0xKQ0KZ2JtX2Jhc2UgPC0gaDJvLmxvYWRNb2RlbCgiR0JNX21vZGVsX1JfMTUxODA0MTM5MjY3Nl8xIikNCiMgZGV2dG9vbHM6Omluc3RhbGxfZ2l0aHViKCJoMm9haS9oMm8tMy9oMm8tci9lbnNlbWJsZS9oMm9FbnNlbWJsZS1wYWNrYWdlIikNCmgyb19wcmVkIDwtIGFzLmgybyhwcmVkX2R0bSkNCnByZWQxIDwtIGgyby5wcmVkaWN0KGdibV9iYXNlLCBoMm9fcHJlZCkgJT4lDQogIGFzLmRhdGEuZnJhbWUoKQ0KDQpkZl9wcmVkIDwtIGRhdGEuZnJhbWUoZGYsIHByZWQxKQ0KDQoNCmRmX3ByZWQgPC0gZGZfcHJlZCAlPiUgDQogIGZpbHRlcihwcmVkaWN0ID09IDEpICU+JSANCiAgbXV0YXRlKHNwID0gbnRpbGUoaWQsIG4gPSAxMCkpICU+JSANCiAgYXJyYW5nZShyYW4gPSBybm9ybShuKCkpKSANCiMgDQojIGZvcihqaiBpbiAxOjEwKXsNCiMgICBzYXZlKGRmX3ByZWQgJT4lIGZpbHRlcihzcCA9PSBqaiksIGZpbGUgPSAiIikNCiMgfQ0KDQpkZl9saXN0IDwtIGRmX3ByZWQgJT4lIHNwbGl0KC4sIC5bLCJzcCJdKQ0Kc2F2ZShkZl9saXN0LCBmaWxlID0gImRmX2xpc3QuUmRhdGEiKQ0KYGBgDQoNCg0K